library(Hmisc)
## Warning: package 'Hmisc' was built under R version 4.0.5
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.5
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
library(psych)
## Warning: package 'psych' was built under R version 4.0.5
##
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
##
## describe
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.6 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.0.0 v forcats 0.5.1
## v purrr 0.3.4
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x psych::%+%() masks ggplot2::%+%()
## x psych::alpha() masks ggplot2::alpha()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x dplyr::src() masks Hmisc::src()
## x dplyr::summarize() masks Hmisc::summarize()
library(skimr)
## Warning: package 'skimr' was built under R version 4.0.5
library(purrr)
library(tidyr)
library(tidyverse)
dfTrain <- read.csv("D:\\RStudio\\621\\Baseball\\moneyball-training-data.csv", header=TRUE)
dfTrain2 <- dfTrain
dim(dfTrain)
## [1] 2276 17
summary(dfTrain)
## INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B
## Min. : 1.0 Min. : 0.00 Min. : 891 Min. : 69.0
## 1st Qu.: 630.8 1st Qu.: 71.00 1st Qu.:1383 1st Qu.:208.0
## Median :1270.5 Median : 82.00 Median :1454 Median :238.0
## Mean :1268.5 Mean : 80.79 Mean :1469 Mean :241.2
## 3rd Qu.:1915.5 3rd Qu.: 92.00 3rd Qu.:1537 3rd Qu.:273.0
## Max. :2535.0 Max. :146.00 Max. :2554 Max. :458.0
##
## TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO
## Min. : 0.00 Min. : 0.00 Min. : 0.0 Min. : 0.0
## 1st Qu.: 34.00 1st Qu.: 42.00 1st Qu.:451.0 1st Qu.: 548.0
## Median : 47.00 Median :102.00 Median :512.0 Median : 750.0
## Mean : 55.25 Mean : 99.61 Mean :501.6 Mean : 735.6
## 3rd Qu.: 72.00 3rd Qu.:147.00 3rd Qu.:580.0 3rd Qu.: 930.0
## Max. :223.00 Max. :264.00 Max. :878.0 Max. :1399.0
## NA's :102
## TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H
## Min. : 0.0 Min. : 0.0 Min. :29.00 Min. : 1137
## 1st Qu.: 66.0 1st Qu.: 38.0 1st Qu.:50.50 1st Qu.: 1419
## Median :101.0 Median : 49.0 Median :58.00 Median : 1518
## Mean :124.8 Mean : 52.8 Mean :59.36 Mean : 1779
## 3rd Qu.:156.0 3rd Qu.: 62.0 3rd Qu.:67.00 3rd Qu.: 1682
## Max. :697.0 Max. :201.0 Max. :95.00 Max. :30132
## NA's :131 NA's :772 NA's :2085
## TEAM_PITCHING_HR TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 65.0
## 1st Qu.: 50.0 1st Qu.: 476.0 1st Qu.: 615.0 1st Qu.: 127.0
## Median :107.0 Median : 536.5 Median : 813.5 Median : 159.0
## Mean :105.7 Mean : 553.0 Mean : 817.7 Mean : 246.5
## 3rd Qu.:150.0 3rd Qu.: 611.0 3rd Qu.: 968.0 3rd Qu.: 249.2
## Max. :343.0 Max. :3645.0 Max. :19278.0 Max. :1898.0
## NA's :102
## TEAM_FIELDING_DP
## Min. : 52.0
## 1st Qu.:131.0
## Median :149.0
## Mean :146.4
## 3rd Qu.:164.0
## Max. :228.0
## NA's :286
skim(dfTrain)
| Name | dfTrain |
| Number of rows | 2276 |
| Number of columns | 17 |
| _______________________ | |
| Column type frequency: | |
| numeric | 17 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| INDEX | 0 | 1.00 | 1268.46 | 736.35 | 1 | 630.75 | 1270.5 | 1915.50 | 2535 | ▇▇▇▇▇ |
| TARGET_WINS | 0 | 1.00 | 80.79 | 15.75 | 0 | 71.00 | 82.0 | 92.00 | 146 | ▁▁▇▅▁ |
| TEAM_BATTING_H | 0 | 1.00 | 1469.27 | 144.59 | 891 | 1383.00 | 1454.0 | 1537.25 | 2554 | ▁▇▂▁▁ |
| TEAM_BATTING_2B | 0 | 1.00 | 241.25 | 46.80 | 69 | 208.00 | 238.0 | 273.00 | 458 | ▁▆▇▂▁ |
| TEAM_BATTING_3B | 0 | 1.00 | 55.25 | 27.94 | 0 | 34.00 | 47.0 | 72.00 | 223 | ▇▇▂▁▁ |
| TEAM_BATTING_HR | 0 | 1.00 | 99.61 | 60.55 | 0 | 42.00 | 102.0 | 147.00 | 264 | ▇▆▇▅▁ |
| TEAM_BATTING_BB | 0 | 1.00 | 501.56 | 122.67 | 0 | 451.00 | 512.0 | 580.00 | 878 | ▁▁▇▇▁ |
| TEAM_BATTING_SO | 102 | 0.96 | 735.61 | 248.53 | 0 | 548.00 | 750.0 | 930.00 | 1399 | ▁▆▇▇▁ |
| TEAM_BASERUN_SB | 131 | 0.94 | 124.76 | 87.79 | 0 | 66.00 | 101.0 | 156.00 | 697 | ▇▃▁▁▁ |
| TEAM_BASERUN_CS | 772 | 0.66 | 52.80 | 22.96 | 0 | 38.00 | 49.0 | 62.00 | 201 | ▃▇▁▁▁ |
| TEAM_BATTING_HBP | 2085 | 0.08 | 59.36 | 12.97 | 29 | 50.50 | 58.0 | 67.00 | 95 | ▂▇▇▅▁ |
| TEAM_PITCHING_H | 0 | 1.00 | 1779.21 | 1406.84 | 1137 | 1419.00 | 1518.0 | 1682.50 | 30132 | ▇▁▁▁▁ |
| TEAM_PITCHING_HR | 0 | 1.00 | 105.70 | 61.30 | 0 | 50.00 | 107.0 | 150.00 | 343 | ▇▇▆▁▁ |
| TEAM_PITCHING_BB | 0 | 1.00 | 553.01 | 166.36 | 0 | 476.00 | 536.5 | 611.00 | 3645 | ▇▁▁▁▁ |
| TEAM_PITCHING_SO | 102 | 0.96 | 817.73 | 553.09 | 0 | 615.00 | 813.5 | 968.00 | 19278 | ▇▁▁▁▁ |
| TEAM_FIELDING_E | 0 | 1.00 | 246.48 | 227.77 | 65 | 127.00 | 159.0 | 249.25 | 1898 | ▇▁▁▁▁ |
| TEAM_FIELDING_DP | 286 | 0.87 | 146.39 | 26.23 | 52 | 131.00 | 149.0 | 164.00 | 228 | ▁▂▇▆▁ |
str(dfTrain)
## 'data.frame': 2276 obs. of 17 variables:
## $ INDEX : int 1 2 3 4 5 6 7 8 11 12 ...
## $ TARGET_WINS : int 39 70 86 70 82 75 80 85 86 76 ...
## $ TEAM_BATTING_H : int 1445 1339 1377 1387 1297 1279 1244 1273 1391 1271 ...
## $ TEAM_BATTING_2B : int 194 219 232 209 186 200 179 171 197 213 ...
## $ TEAM_BATTING_3B : int 39 22 35 38 27 36 54 37 40 18 ...
## $ TEAM_BATTING_HR : int 13 190 137 96 102 92 122 115 114 96 ...
## $ TEAM_BATTING_BB : int 143 685 602 451 472 443 525 456 447 441 ...
## $ TEAM_BATTING_SO : int 842 1075 917 922 920 973 1062 1027 922 827 ...
## $ TEAM_BASERUN_SB : int NA 37 46 43 49 107 80 40 69 72 ...
## $ TEAM_BASERUN_CS : int NA 28 27 30 39 59 54 36 27 34 ...
## $ TEAM_BATTING_HBP: int NA NA NA NA NA NA NA NA NA NA ...
## $ TEAM_PITCHING_H : int 9364 1347 1377 1396 1297 1279 1244 1281 1391 1271 ...
## $ TEAM_PITCHING_HR: int 84 191 137 97 102 92 122 116 114 96 ...
## $ TEAM_PITCHING_BB: int 927 689 602 454 472 443 525 459 447 441 ...
## $ TEAM_PITCHING_SO: int 5456 1082 917 928 920 973 1062 1033 922 827 ...
## $ TEAM_FIELDING_E : int 1011 193 175 164 138 123 136 112 127 131 ...
## $ TEAM_FIELDING_DP: int NA 155 153 156 168 149 186 136 169 159 ...
for(i in 2:ncol(dfTrain)) {
print(ggplot(dfTrain, aes(dfTrain[ , i])) +
coord_flip() +
xlab(colnames(dfTrain)[i]) +
geom_boxplot())
print(head(sort(dfTrain[,i])))
print(tail(sort(dfTrain[,i])))
}
## [1] 0 12 14 17 21 22
## [1] 126 128 129 134 135 146
## [1] 891 992 1009 1116 1122 1137
## [1] 2305 2333 2343 2372 2496 2554
## [1] 69 112 113 118 123 125
## [1] 378 382 392 393 403 458
## [1] 0 0 8 9 11 12
## [1] 165 166 190 197 200 223
## [1] 0 0 0 0 0 0
## [1] 246 247 249 257 260 264
## [1] 0 12 29 34 45 45
## [1] 806 815 819 824 860 878
## Warning: Removed 102 rows containing non-finite values (stat_boxplot).
## [1] 0 0 0 0 0 0
## [1] 1273 1303 1320 1326 1335 1399
## Warning: Removed 131 rows containing non-finite values (stat_boxplot).
## [1] 0 0 14 18 18 18
## [1] 558 562 567 632 654 697
## Warning: Removed 772 rows containing non-finite values (stat_boxplot).
## [1] 0 7 11 12 14 14
## [1] 171 186 193 200 200 201
## Warning: Removed 2085 rows containing non-finite values (stat_boxplot).
## [1] 29 29 30 35 35 35
## [1] 89 89 89 89 90 95
## [1] 1137 1168 1184 1187 1202 1202
## [1] 14749 16038 16871 20088 24057 30132
## [1] 0 0 0 0 0 0
## [1] 291 297 301 320 320 343
## [1] 0 119 124 131 140 144
## [1] 1750 2169 2396 2840 2876 3645
## Warning: Removed 102 rows containing non-finite values (stat_boxplot).
## [1] 0 0 0 0 0 0
## [1] 2492 3450 4224 5456 12758 19278
## [1] 65 66 68 72 74 77
## [1] 1553 1567 1728 1740 1890 1898
## Warning: Removed 286 rows containing non-finite values (stat_boxplot).
## [1] 52 64 68 71 72 72
## [1] 215 215 218 219 225 228
There are 4 categories where 0s may be nas: Pitching and Batting HR and Pitching and batting SO. We look more closely at these categories:
dfTrain_ZeroAsNA <- dfTrain %>%
dplyr::select(TEAM_PITCHING_SO, TEAM_PITCHING_HR, TEAM_BATTING_SO, TEAM_BATTING_HR)
hist(dfTrain_ZeroAsNA)
Will do nothing with outliers or na as zero for now
Team_Batting_HPBA has too many so we remove it:
dfTrain2 <- dfTrain2 %>%
dplyr::select(-TEAM_BATTING_HBP)
Before we impute the values for NAs, we need to ensure there isn’t any kind of grouping effect for the records with NA. Fact that several columns have the same number of missings suggests there might be. So first we look to see if the missings are collinear:
dfTrain2 <- dfTrain2 %>%
mutate(Missing_Flag = ifelse(is.na(TEAM_BATTING_SO),1,0))
dfTrain3 <- dfTrain2 %>%
dplyr::filter(Missing_Flag == 0) %>%
dplyr::select(TEAM_BATTING_SO, TEAM_PITCHING_SO, TEAM_BASERUN_CS, TEAM_BASERUN_SB)
summary(dfTrain3)
## TEAM_BATTING_SO TEAM_PITCHING_SO TEAM_BASERUN_CS TEAM_BASERUN_SB
## Min. : 0.0 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 548.0 1st Qu.: 615.0 1st Qu.: 38.0 1st Qu.: 65.0
## Median : 750.0 Median : 813.5 Median : 49.0 Median : 98.0
## Mean : 735.6 Mean : 817.7 Mean : 52.8 Mean :120.8
## 3rd Qu.: 930.0 3rd Qu.: 968.0 3rd Qu.: 62.0 3rd Qu.:147.0
## Max. :1399.0 Max. :19278.0 Max. :201.0 Max. :697.0
## NA's :670 NA's :131
There is some cohort effect as there is complete duplication with pitching so and batting so, and some overlap with baserun cs. Now lets impute the median and see how well the new modelperforms vs the old:
dfTrain_ImputedMedian <- data.frame(
sapply(dfTrain2, function(x) ifelse(is.na(x), median(x, na.rm = TRUE), x)))
dfTrain_ImputedMean <- data.frame(
sapply(dfTrain2, function(x) ifelse(is.na(x), mean(x, na.rm = TRUE), x)))
m1 <- lm(TARGET_WINS ~ ., dfTrain2)
m2 <- lm(TARGET_WINS ~ ., dfTrain_ImputedMedian)
m3 <- lm(TARGET_WINS ~ ., dfTrain_ImputedMean)
summary(m1)$adj.r.squared
## [1] 0.4330872
summary(m2)$adj.r.squared
## [1] 0.313437
summary(m3)$adj.r.squared
## [1] 0.3169625
There appears to be a large effect.
Now we can look at interactions between the “cohort” and other variables:
par(mfcol=c(2,2))
dfTrain_ImputedMean$Missing_Flag <- as.factor(dfTrain_ImputedMean$Missing_Flag)
for(i in 2:ncol(dfTrain_ImputedMean)) {
print(ggplot(dfTrain_ImputedMean, aes(dfTrain_ImputedMean[ ,i], TARGET_WINS, color=Missing_Flag)) +
geom_point() +
geom_smooth(method = "lm", se=FALSE) +
ggtitle(colnames(dfTrain_ImputedMean)[i]))
}
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
The interaction analysis suggests that the cohort is not random - there are numerous interactions with many other variables, some of which are quite counterinutitive (team pitching H). So we could either do a random effects/flag/interactions or toss them. Becuase bad data is not reproducible I will toss, at the expense of better predicitons if I can identify the cohort in the eval data.
dfTrain_ImputedMean_NoCohort <- dfTrain_ImputedMean %>%
filter(Missing_Flag==0) %>%
dplyr::select(-Missing_Flag)
summary(dfTrain_ImputedMean_NoCohort)
## INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B
## Min. : 1.0 Min. : 0.00 Min. : 891 Min. : 69.0
## 1st Qu.: 640.2 1st Qu.: 71.00 1st Qu.:1389 1st Qu.:211.2
## Median :1275.5 Median : 82.00 Median :1458 Median :240.0
## Mean :1275.2 Mean : 80.76 Mean :1475 Mean :243.9
## 3rd Qu.:1923.8 3rd Qu.: 91.00 3rd Qu.:1541 3rd Qu.:275.0
## Max. :2535.0 Max. :146.00 Max. :2554 Max. :458.0
## TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO
## Min. : 0.00 Min. : 0.0 Min. : 0.0 Min. : 0.0
## 1st Qu.: 34.00 1st Qu.: 48.0 1st Qu.:456.0 1st Qu.: 548.0
## Median : 46.00 Median :107.0 Median :517.0 Median : 750.0
## Mean : 54.45 Mean :103.4 Mean :505.1 Mean : 735.6
## 3rd Qu.: 71.00 3rd Qu.:148.0 3rd Qu.:582.0 3rd Qu.: 930.0
## Max. :223.00 Max. :264.0 Max. :878.0 Max. :1399.0
## TEAM_BASERUN_SB TEAM_BASERUN_CS TEAM_PITCHING_H TEAM_PITCHING_HR
## Min. : 0.0 Min. : 0.0 Min. : 1137 Min. : 0.0
## 1st Qu.: 66.0 1st Qu.: 44.0 1st Qu.: 1425 1st Qu.: 58.0
## Median :102.0 Median : 52.8 Median : 1521 Median :111.0
## Mean :121.1 Mean : 52.8 Mean : 1794 Mean :109.7
## 3rd Qu.:143.8 3rd Qu.: 55.0 3rd Qu.: 1694 3rd Qu.:152.8
## Max. :697.0 Max. :201.0 Max. :30132 Max. :343.0
## TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## Min. : 0.0 Min. : 0.0 Min. : 65.0 Min. : 52.0
## 1st Qu.: 479.2 1st Qu.: 615.0 1st Qu.: 126.0 1st Qu.:137.0
## Median : 542.0 Median : 813.5 Median : 155.0 Median :146.4
## Mean : 557.5 Mean : 817.7 Mean : 243.9 Mean :148.6
## 3rd Qu.: 614.8 3rd Qu.: 968.0 3rd Qu.: 234.0 3rd Qu.:162.0
## Max. :3645.0 Max. :19278.0 Max. :1898.0 Max. :228.0
Curious to look at impact of imputing median on correlation:
summary(lm(dfTrain$TARGET_WINS ~ dfTrain$TEAM_PITCHING_SO))
##
## Call:
## lm(formula = dfTrain$TARGET_WINS ~ dfTrain$TEAM_PITCHING_SO)
##
## Residuals:
## Min 1Q Median 3Q Max
## -82.570 -9.402 0.970 10.484 63.430
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 82.5704787 0.5945630 138.876 < 2e-16 ***
## dfTrain$TEAM_PITCHING_SO -0.0022085 0.0006023 -3.667 0.000252 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.53 on 2172 degrees of freedom
## (102 observations deleted due to missingness)
## Multiple R-squared: 0.006152, Adjusted R-squared: 0.005695
## F-statistic: 13.45 on 1 and 2172 DF, p-value: 0.0002515
summary(lm(dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian$TEAM_PITCHING_SO))
##
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian$TEAM_PITCHING_SO)
##
## Residuals:
## Min 1Q Median 3Q Max
## -82.597 -9.554 0.980 10.633 63.403
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 82.5967253 0.5971665 138.314 < 2e-16
## dfTrain_ImputedMedian$TEAM_PITCHING_SO -0.0022089 0.0006093 -3.625 0.000295
##
## (Intercept) ***
## dfTrain_ImputedMedian$TEAM_PITCHING_SO ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.71 on 2274 degrees of freedom
## Multiple R-squared: 0.005746, Adjusted R-squared: 0.005308
## F-statistic: 13.14 on 1 and 2274 DF, p-value: 0.0002953
Th effect is minimal.
dfTrain %>%
keep(is.numeric) %>%
gather() %>%
ggplot(aes(value)) +
facet_wrap(~ key, scales = "free") +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3478 rows containing non-finite values (stat_bin).
dfTrain %>%
keep(is.numeric) %>%
gather() %>%
ggplot(aes(value)) +
facet_wrap(~ key, scales = "free") +
geom_boxplot() +
coord_flip()
## Warning: Removed 3478 rows containing non-finite values (stat_boxplot).
Outlier analysis again
for(i in 2:ncol(dfTrain_ImputedMean_NoCohort)) {
print(ggplot(dfTrain_ImputedMean_NoCohort, aes(dfTrain_ImputedMean_NoCohort[ , i])) +
coord_flip() +
xlab(colnames(dfTrain_ImputedMean_NoCohort)[i]) +
geom_boxplot())
print(head(sort(dfTrain_ImputedMean_NoCohort[,i])))
print(tail(sort(dfTrain_ImputedMean_NoCohort[,i])))
}
## [1] 0 12 14 17 21 22
## [1] 126 128 129 134 135 146
## [1] 891 992 1009 1116 1122 1137
## [1] 2305 2333 2343 2372 2496 2554
## [1] 69 112 113 118 127 130
## [1] 378 382 392 393 403 458
## [1] 0 0 8 9 11 12
## [1] 165 166 190 197 200 223
## [1] 0 0 0 0 0 0
## [1] 246 247 249 257 260 264
## [1] 0 12 29 34 45 45
## [1] 806 815 819 824 860 878
## [1] 0 0 0 0 0 0
## [1] 1273 1303 1320 1326 1335 1399
## [1] 0 0 14 18 18 18
## [1] 558 562 567 632 654 697
## [1] 0 7 11 12 14 14
## [1] 171 186 193 200 200 201
## [1] 1137 1168 1184 1187 1202 1202
## [1] 14749 16038 16871 20088 24057 30132
## [1] 0 0 0 0 0 0
## [1] 291 297 301 320 320 343
## [1] 0 119 124 131 140 144
## [1] 1750 2169 2396 2840 2876 3645
## [1] 0 0 0 0 0 0
## [1] 2492 3450 4224 5456 12758 19278
## [1] 65 66 68 72 74 77
## [1] 1553 1567 1728 1740 1890 1898
## [1] 52 64 71 72 75 78
## [1] 215 215 218 219 225 228
hist(dfTrain$TARGET_WINS, bins=20)
## Warning in plot.window(xlim, ylim, "", ...): "bins" is not a graphical parameter
## Warning in title(main = main, sub = sub, xlab = xlab, ylab = ylab, ...): "bins"
## is not a graphical parameter
## Warning in axis(1, ...): "bins" is not a graphical parameter
## Warning in axis(2, ...): "bins" is not a graphical parameter
head(sort(dfTrain$TARGET_WINS))
## [1] 0 12 14 17 21 22
dfTrain_ZeroWins <- dfTrain %>%
dplyr::filter(TARGET_WINS ==0)
head(dfTrain_ZeroWins, 1)
## INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B TEAM_BATTING_3B
## 1 1347 0 891 135 0
## TEAM_BATTING_HR TEAM_BATTING_BB TEAM_BATTING_SO TEAM_BASERUN_SB
## 1 0 0 0 0
## TEAM_BASERUN_CS TEAM_BATTING_HBP TEAM_PITCHING_H TEAM_PITCHING_HR
## 1 0 NA 24057 0
## TEAM_PITCHING_BB TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## 1 0 0 1890 NA
Target_Wins appears normally distributed - the zero is suspicious but I’m going to leave it.
dfCor <- as.data.frame(cor(dfTrain_ImputedMean_NoCohort))
dfCor
## INDEX TARGET_WINS TEAM_BATTING_H TEAM_BATTING_2B
## INDEX 1.0000000000 -0.02928140 -0.03131390 -0.003976934
## TARGET_WINS -0.0292813985 1.00000000 0.39476995 0.293205037
## TEAM_BATTING_H -0.0313139014 0.39476995 1.00000000 0.540648272
## TEAM_BATTING_2B -0.0039769341 0.29320504 0.54064827 1.000000000
## TEAM_BATTING_3B -0.0049758496 0.13685882 0.45802046 -0.085325497
## TEAM_BATTING_HR 0.0413809930 0.19059035 -0.06194956 0.393641975
## TEAM_BATTING_BB -0.0358540809 0.23250609 -0.10545406 0.230196649
## TEAM_BATTING_SO 0.0814501106 -0.03175071 -0.46385357 0.162685188
## TEAM_BASERUN_SB 0.0435154747 0.11143414 0.14886129 -0.153728585
## TEAM_BASERUN_CS 0.0004632733 0.01610843 0.01198251 -0.077632602
## TEAM_PITCHING_H 0.0146890757 -0.11576530 0.29979491 0.008872511
## TEAM_PITCHING_HR 0.0403725584 0.20531868 0.02082589 0.412455481
## TEAM_PITCHING_BB -0.0233549401 0.12063924 0.07067846 0.149565361
## TEAM_PITCHING_SO 0.0558901457 -0.07843609 -0.25265679 0.064792315
## TEAM_FIELDING_E -0.0068738726 -0.17639551 0.28252119 -0.232247607
## TEAM_FIELDING_DP 0.0061318975 -0.02860414 0.04535652 0.178563220
## TEAM_BATTING_3B TEAM_BATTING_HR TEAM_BATTING_BB
## INDEX -0.00497585 0.04138099 -0.03585408
## TARGET_WINS 0.13685882 0.19059035 0.23250609
## TEAM_BATTING_H 0.45802046 -0.06194956 -0.10545406
## TEAM_BATTING_2B -0.08532550 0.39364197 0.23019665
## TEAM_BATTING_3B 1.00000000 -0.63765753 -0.28160593
## TEAM_BATTING_HR -0.63765753 1.00000000 0.50439692
## TEAM_BATTING_BB -0.28160593 0.50439692 1.00000000
## TEAM_BATTING_SO -0.66978119 0.72706935 0.37975087
## TEAM_BASERUN_SB 0.49301668 -0.39942181 -0.06545891
## TEAM_BASERUN_CS 0.19833581 -0.30347433 -0.08612025
## TEAM_PITCHING_H 0.20396690 -0.27656010 -0.46585690
## TEAM_PITCHING_HR -0.56629509 0.96659392 0.44681242
## TEAM_PITCHING_BB 0.01294580 0.10677385 0.47385394
## TEAM_PITCHING_SO -0.25881893 0.18470756 -0.02075682
## TEAM_FIELDING_E 0.51354615 -0.59891151 -0.66138116
## TEAM_FIELDING_DP -0.21908499 0.33368751 0.32158157
## TEAM_BATTING_SO TEAM_BASERUN_SB TEAM_BASERUN_CS
## INDEX 0.08145011 0.04351547 0.0004632733
## TARGET_WINS -0.03175071 0.11143414 0.0161084320
## TEAM_BATTING_H -0.46385357 0.14886129 0.0119825143
## TEAM_BATTING_2B 0.16268519 -0.15372858 -0.0776326024
## TEAM_BATTING_3B -0.66978119 0.49301668 0.1983358054
## TEAM_BATTING_HR 0.72706935 -0.39942181 -0.3034743273
## TEAM_BATTING_BB 0.37975087 -0.06545891 -0.0861202523
## TEAM_BATTING_SO 1.00000000 -0.23837153 -0.1566149092
## TEAM_BASERUN_SB -0.23837153 1.00000000 0.2869124889
## TEAM_BASERUN_CS -0.15661491 0.28691249 1.0000000000
## TEAM_PITCHING_H -0.37568637 0.07198568 -0.0369545996
## TEAM_PITCHING_HR 0.66717889 -0.36564098 -0.3034478040
## TEAM_PITCHING_BB 0.03700514 0.14323815 -0.0542531880
## TEAM_PITCHING_SO 0.41623330 -0.05615058 -0.0686217842
## TEAM_FIELDING_E -0.58466444 0.36999309 0.0236201201
## TEAM_FIELDING_DP 0.14599850 -0.24957358 -0.1563091914
## TEAM_PITCHING_H TEAM_PITCHING_HR TEAM_PITCHING_BB
## INDEX 0.014689076 0.04037256 -0.02335494
## TARGET_WINS -0.115765302 0.20531868 0.12063924
## TEAM_BATTING_H 0.299794910 0.02082589 0.07067846
## TEAM_BATTING_2B 0.008872511 0.41245548 0.14956536
## TEAM_BATTING_3B 0.203966905 -0.56629509 0.01294580
## TEAM_BATTING_HR -0.276560100 0.96659392 0.10677385
## TEAM_BATTING_BB -0.465856896 0.44681242 0.47385394
## TEAM_BATTING_SO -0.375686369 0.66717889 0.03700514
## TEAM_BASERUN_SB 0.071985680 -0.36564098 0.14323815
## TEAM_BASERUN_CS -0.036954600 -0.30344780 -0.05425319
## TEAM_PITCHING_H 1.000000000 -0.16448724 0.31845282
## TEAM_PITCHING_HR -0.164487236 1.00000000 0.19575531
## TEAM_PITCHING_BB 0.318452818 0.19575531 1.00000000
## TEAM_PITCHING_SO 0.267248074 0.20588053 0.48849865
## TEAM_FIELDING_E 0.672838853 -0.50175814 -0.01637592
## TEAM_FIELDING_DP -0.088957308 0.32336753 0.15211734
## TEAM_PITCHING_SO TEAM_FIELDING_E TEAM_FIELDING_DP
## INDEX 0.05589015 -0.006873873 0.006131897
## TARGET_WINS -0.07843609 -0.176395507 -0.028604138
## TEAM_BATTING_H -0.25265679 0.282521195 0.045356517
## TEAM_BATTING_2B 0.06479231 -0.232247607 0.178563220
## TEAM_BATTING_3B -0.25881893 0.513546149 -0.219084985
## TEAM_BATTING_HR 0.18470756 -0.598911507 0.333687510
## TEAM_BATTING_BB -0.02075682 -0.661381160 0.321581568
## TEAM_BATTING_SO 0.41623330 -0.584664436 0.145998500
## TEAM_BASERUN_SB -0.05615058 0.369993094 -0.249573580
## TEAM_BASERUN_CS -0.06862178 0.023620120 -0.156309191
## TEAM_PITCHING_H 0.26724807 0.672838853 -0.088957308
## TEAM_PITCHING_HR 0.20588053 -0.501758136 0.323367525
## TEAM_PITCHING_BB 0.48849865 -0.016375919 0.152117341
## TEAM_PITCHING_SO 1.00000000 -0.023291783 0.010392318
## TEAM_FIELDING_E -0.02329178 1.000000000 -0.257897297
## TEAM_FIELDING_DP 0.01039232 -0.257897297 1.000000000
heatmap(as.matrix(dfCor), Rowv = NA, Colv = NA)
Invsteigate suspicious HR category
cor.test(dfTrain$TEAM_PITCHING_HR, dfTrain$TARGET_WINS)
##
## Pearson's product-moment correlation
##
## data: dfTrain$TEAM_PITCHING_HR and dfTrain$TARGET_WINS
## t = 9.1789, df = 2274, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1490846 0.2283275
## sample estimates:
## cor
## 0.1890137
ggplot(dfTrain, aes(TEAM_PITCHING_HR, TEAM_BATTING_HR, color=INDEX)) +
geom_point()
hist(dfTrain$TEAM_PITCHING_HR, breaks=100)
plot(dfTrain$TEAM_PITCHING_HR, dfTrain$TARGET_WINS)
m1 <- lm(TARGET_WINS ~ TEAM_PITCHING_HR, data=dfTrain)
summary(m1)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_PITCHING_HR, data = dfTrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -75.657 -9.956 0.636 10.055 67.477
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 75.656920 0.646540 117.018 <2e-16 ***
## TEAM_PITCHING_HR 0.048572 0.005292 9.179 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.47 on 2274 degrees of freedom
## Multiple R-squared: 0.03573, Adjusted R-squared: 0.0353
## F-statistic: 84.25 on 1 and 2274 DF, p-value: < 2.2e-16
plot(m1)
library(car)
## Warning: package 'car' was built under R version 4.0.5
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
## The following object is masked from 'package:psych':
##
## logit
influencePlot(m1, id.method='identify', main='Influence Plot', sub='Circle size is proportional to Cook’s distance')
## Warning in plot.window(...): "id.method" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "id.method" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.method" is not
## a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.method" is not
## a graphical parameter
## Warning in box(...): "id.method" is not a graphical parameter
## Warning in title(...): "id.method" is not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.method" is not a
## graphical parameter
## StudRes Hat CookD
## 299 4.380293 0.0006944747 0.0066141630
## 832 0.173993 0.0070267976 0.0001071615
## 964 -1.050146 0.0058117315 0.0032231964
## 1211 -4.919225 0.0017463018 0.0209523937
## 2233 -4.132563 0.0017463018 0.0148329515
dfTrain2 <- dfTrain[-c(1211,2233,299,1825, 832), ]
cor.test(dfTrain2$TEAM_PITCHING_HR, dfTrain2$TARGET_WINS)
##
## Pearson's product-moment correlation
##
## data: dfTrain2$TEAM_PITCHING_HR and dfTrain2$TARGET_WINS
## t = 8.8525, df = 2269, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1426547 0.2221771
## sample estimates:
## cor
## 0.1827147
m2 <- lm(TARGET_WINS ~ TEAM_PITCHING_HR, data=dfTrain2)
summary(m2)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_PITCHING_HR, data = dfTrain2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -58.949 -9.929 0.614 10.028 55.992
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 75.948820 0.639361 118.789 <2e-16 ***
## TEAM_PITCHING_HR 0.046356 0.005237 8.852 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.23 on 2269 degrees of freedom
## Multiple R-squared: 0.03338, Adjusted R-squared: 0.03296
## F-statistic: 78.37 on 1 and 2269 DF, p-value: < 2.2e-16
plot(m2)
library(car)
influencePlot(m2, id.method='identify', main='Influence Plot', sub='Circle size is proportional to Cook’s distance')
## Warning in plot.window(...): "id.method" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "id.method" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.method" is not
## a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.method" is not
## a graphical parameter
## Warning in box(...): "id.method" is not a graphical parameter
## Warning in title(...): "id.method" is not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.method" is not a
## graphical parameter
## StudRes Hat CookD
## 964 -1.039523 0.0058683344 0.003189286
## 982 -3.886600 0.0017628831 0.013255859
## 1810 2.114722 0.0049482791 0.011102505
## 1882 -1.303158 0.0058683344 0.005010737
## 2012 3.688318 0.0006272236 0.004245374
summary(m1$residuals)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -75.6569 -9.9562 0.6359 0.0000 10.0552 67.4774
describe(m1$residuals)
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 2276 0 15.47 0.64 0.2 14.84 -75.66 67.48 143.13 -0.18 0.86
## se
## X1 0.32
dfTrain$Residuals <- m1$residuals
dfTrain$Fitted <- m1$fitted.values
library(tidyverse)
dfTrain_WithoutHR <- dfTrain %>%
dplyr::filter(TARGET_WINS >=50 | TEAM_PITCHING_HR!=0)
hist(dfTrain_WithoutHR$TEAM_PITCHING_HR)
plot(dfTrain_WithoutHR$TEAM_PITCHING_HR, dfTrain_WithoutHR$TARGET_WINS)
m3 <- lm(TARGET_WINS ~ TEAM_PITCHING_HR, data=dfTrain_WithoutHR)
summary(m3)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_PITCHING_HR, data = dfTrain_WithoutHR)
##
## Residuals:
## Min 1Q Median 3Q Max
## -56.208 -9.802 0.653 9.952 66.914
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 76.624136 0.636539 120.376 < 2e-16 ***
## TEAM_PITCHING_HR 0.041723 0.005197 8.028 1.58e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.09 on 2263 degrees of freedom
## Multiple R-squared: 0.02769, Adjusted R-squared: 0.02726
## F-statistic: 64.45 on 1 and 2263 DF, p-value: 1.576e-15
plot(m3)
library(car)
influencePlot(m3, id.method='identify', main='Influence Plot', sub='Circle size is proportional to Cook’s distance')
## Warning in plot.window(...): "id.method" is not a graphical parameter
## Warning in plot.xy(xy, type, ...): "id.method" is not a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.method" is not
## a graphical parameter
## Warning in axis(side = side, at = at, labels = labels, ...): "id.method" is not
## a graphical parameter
## Warning in box(...): "id.method" is not a graphical parameter
## Warning in title(...): "id.method" is not a graphical parameter
## Warning in plot.xy(xy.coords(x, y), type = type, ...): "id.method" is not a
## graphical parameter
## StudRes Hat CookD
## 299 4.4557422 0.0007060697 0.0069560265
## 829 0.2703629 0.0070966028 0.0002613277
## 856 -3.7394216 0.0014507753 0.0101000850
## 961 -0.9956483 0.0058665293 0.0029249611
## 1804 2.1826581 0.0049451007 0.0118181032
dfTrain_BiModal <- dfTrain %>%
mutate(HR_Low = if_else(TEAM_PITCHING_HR<50,1,0)) %>%
mutate(HR_High = if_else(TEAM_PITCHING_HR>=50,1,0))
dfCor_BiModal <- as.data.frame(cor(dfTrain_BiModal))
m4 <- lm(TARGET_WINS ~ TEAM_PITCHING_HR + HR_Low, data=dfTrain_BiModal)
summary(m4)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_PITCHING_HR + HR_Low, data = dfTrain_BiModal)
##
## Residuals:
## Min 1Q Median 3Q Max
## -75.692 -9.976 0.653 10.058 67.556
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 75.529253 1.069339 70.632 < 2e-16 ***
## TEAM_PITCHING_HR 0.049398 0.007641 6.465 1.24e-10 ***
## HR_Low 0.162504 1.084033 0.150 0.881
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.47 on 2273 degrees of freedom
## Multiple R-squared: 0.03574, Adjusted R-squared: 0.03489
## F-statistic: 42.12 on 2 and 2273 DF, p-value: < 2.2e-16
plot(m4)
dfHighHR <- dfTrain_BiModal %>%
dplyr::filter(HR_High ==1)
dfLowHR <- dfTrain_BiModal %>%
dplyr::filter(HR_Low==1)
t.test(dfLowHR$TARGET_WINS, dfHighHR$TARGET_WINS)
##
## Welch Two Sample t-test
##
## data: dfLowHR$TARGET_WINS and dfHighHR$TARGET_WINS
## t = -5.4141, df = 753, p-value = 8.291e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -6.665804 -3.118167
## sample estimates:
## mean of x mean of y
## 77.11327 82.00526
m5 <- lm(TARGET_WINS ~ TEAM_PITCHING_HR, data=dfHighHR)
summary(m5)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_PITCHING_HR, data = dfHighHR)
##
## Residuals:
## Min 1Q Median 3Q Max
## -55.641 -9.293 0.650 9.127 67.238
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 76.107959 0.957161 79.514 < 2e-16 ***
## TEAM_PITCHING_HR 0.044983 0.006848 6.569 6.72e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.73 on 1709 degrees of freedom
## Multiple R-squared: 0.02463, Adjusted R-squared: 0.02405
## F-statistic: 43.15 on 1 and 1709 DF, p-value: 6.72e-11
plot(m5)
dfCor_HR <- as.data.frame(cor(dfTrain_BiModal[-1], dfTrain_BiModal$TEAM_PITCHING_HR))
dfCor_Low <- as.data.frame(cor(dfTrain_BiModal[-1], dfTrain_BiModal$HR_Low))
plot(dfTrain$TEAM_BATTING_HR, dfTrain$TEAM_PITCHING_HR)
dfTrain$HR_Diff <- dfTrain$TEAM_PITCHING_HR -dfTrain$TEAM_BATTING_HR
hist(dfTrain$HR_Diff, breaks=100)
describe(dfTrain$HR_Diff)
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 2276 6.09 15.1 2 2.93 2.97 -2 249 251 6.98 71.83 0.32
Sum of HR allowed greatly exceeds sum of HR hit
m6 <- lm(dfTrain$TEAM_BATTING_HR ~ dfTrain$TEAM_PITCHING_HR)
summary(m6)
##
## Call:
## lm(formula = dfTrain$TEAM_BATTING_HR ~ dfTrain$TEAM_PITCHING_HR)
##
## Residuals:
## Min 1Q Median 3Q Max
## -234.609 0.123 1.336 6.992 12.817
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.592392 0.621547 -2.562 0.0105 *
## dfTrain$TEAM_PITCHING_HR 0.957481 0.005087 188.217 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.87 on 2274 degrees of freedom
## Multiple R-squared: 0.9397, Adjusted R-squared: 0.9397
## F-statistic: 3.543e+04 on 1 and 2274 DF, p-value: < 2.2e-16
plot(m6)
cor.test(dfTrain$TEAM_BATTING_BB, dfTrain$TEAM_PITCHING_BB)
##
## Pearson's product-moment correlation
##
## data: dfTrain$TEAM_BATTING_BB and dfTrain$TEAM_PITCHING_BB
## t = 26.759, df = 2274, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4574724 0.5199930
## sample estimates:
## cor
## 0.4893613
plot(dfTrain$TEAM_BATTING_BB, dfTrain$TEAM_PITCHING_BB)
dfTrain_ImputedMedian <- dfTrain_ImputedMean_NoCohort
for(i in 2:ncol(dfTrain_ImputedMedian)) {
print(ggplot(dfTrain_ImputedMedian, aes(dfTrain_ImputedMedian[ , i], x = dfTrain_ImputedMedian$TARGET_WINS)) +
xlab(colnames(dfTrain)[i]) +
stat_smooth(method=loess) +
geom_point())
m <- lm(dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[,i])
par(mfcol=c(2,2))
print(summary(m))
print(plot(m))
}
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'
## Warning in summary.lm(m): essentially perfect fit: summary may be unreliable
##
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[,
## i])
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.647e-14 -1.120e-15 -7.000e-16 -2.800e-16 1.614e-12
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.756e-13 3.928e-15 4.470e+01 <2e-16 ***
## dfTrain_ImputedMedian[, i] 1.000e+00 4.775e-17 2.094e+16 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.467e-14 on 2172 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 4.385e+32 on 1 and 2172 DF, p-value: < 2.2e-16
## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'
##
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[,
## i])
##
## Residuals:
## Min 1Q Median 3Q Max
## -71.761 -8.515 0.971 9.783 43.230
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.686332 3.164963 5.588 2.58e-08 ***
## dfTrain_ImputedMedian[, i] 0.042775 0.002136 20.025 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.31 on 2172 degrees of freedom
## Multiple R-squared: 0.1558, Adjusted R-squared: 0.1555
## F-statistic: 401 on 1 and 2172 DF, p-value: < 2.2e-16
## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'
##
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[,
## i])
##
## Residuals:
## Min 1Q Median 3Q Max
## -69.863 -9.376 0.670 10.121 57.415
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 56.346919 1.737969 32.42 <2e-16 ***
## dfTrain_ImputedMedian[, i] 0.100118 0.007005 14.29 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.89 on 2172 degrees of freedom
## Multiple R-squared: 0.08597, Adjusted R-squared: 0.08555
## F-statistic: 204.3 on 1 and 2172 DF, p-value: < 2.2e-16
## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'
##
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[,
## i])
##
## Residuals:
## Min 1Q Median 3Q Max
## -76.628 -8.980 1.143 10.428 60.940
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 76.62804 0.72265 106.038 < 2e-16 ***
## dfTrain_ImputedMedian[, i] 0.07596 0.01180 6.439 1.48e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.43 on 2172 degrees of freedom
## Multiple R-squared: 0.01873, Adjusted R-squared: 0.01828
## F-statistic: 41.46 on 1 and 2172 DF, p-value: 1.477e-10
## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'
##
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[,
## i])
##
## Residuals:
## Min 1Q Median 3Q Max
## -75.596 -9.734 0.553 10.041 68.954
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 75.595947 0.658670 114.771 <2e-16 ***
## dfTrain_ImputedMedian[, i] 0.050009 0.005527 9.048 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.29 on 2172 degrees of freedom
## Multiple R-squared: 0.03632, Adjusted R-squared: 0.03588
## F-statistic: 81.87 on 1 and 2172 DF, p-value: < 2.2e-16
## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'
##
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[,
## i])
##
## Residuals:
## Min 1Q Median 3Q Max
## -65.936 -9.554 0.579 9.674 78.185
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 65.935670 1.370076 48.13 <2e-16 ***
## dfTrain_ImputedMedian[, i] 0.029358 0.002635 11.14 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.15 on 2172 degrees of freedom
## Multiple R-squared: 0.05406, Adjusted R-squared: 0.05362
## F-statistic: 124.1 on 1 and 2172 DF, p-value: < 2.2e-16
## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'
##
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[,
## i])
##
## Residuals:
## Min 1Q Median 3Q Max
## -82.228 -9.308 0.963 10.609 63.772
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 82.228036 1.043434 78.81 <2e-16 ***
## dfTrain_ImputedMedian[, i] -0.001990 0.001344 -1.48 0.139
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.57 on 2172 degrees of freedom
## Multiple R-squared: 0.001008, Adjusted R-squared: 0.0005482
## F-statistic: 2.192 on 1 and 2172 DF, p-value: 0.1389
## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'
##
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[,
## i])
##
## Residuals:
## Min 1Q Median 3Q Max
## -78.284 -9.080 1.024 10.198 65.160
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 78.28444 0.57917 135.166 < 2e-16 ***
## dfTrain_ImputedMedian[, i] 0.02048 0.00392 5.226 1.9e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.48 on 2172 degrees of freedom
## Multiple R-squared: 0.01242, Adjusted R-squared: 0.01196
## F-statistic: 27.31 on 1 and 2172 DF, p-value: 1.899e-07
## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'
##
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[,
## i])
##
## Residuals:
## Min 1Q Median 3Q Max
## -80.071 -9.493 1.233 10.483 65.236
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 80.07067 0.98260 81.489 <2e-16 ***
## dfTrain_ImputedMedian[, i] 0.01314 0.01750 0.751 0.453
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.57 on 2172 degrees of freedom
## Multiple R-squared: 0.0002595, Adjusted R-squared: -0.0002008
## F-statistic: 0.5637 on 1 and 2172 DF, p-value: 0.4528
## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'
##
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[,
## i])
##
## Residuals:
## Min 1Q Median 3Q Max
## -60.165 -9.462 0.897 10.651 68.914
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 83.0150688 0.5308401 156.384 < 2e-16 ***
## dfTrain_ImputedMedian[, i] -0.0012543 0.0002309 -5.432 6.2e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.47 on 2172 degrees of freedom
## Multiple R-squared: 0.0134, Adjusted R-squared: 0.01295
## F-statistic: 29.5 on 1 and 2172 DF, p-value: 6.205e-08
## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'
##
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[,
## i])
##
## Residuals:
## Min 1Q Median 3Q Max
## -74.906 -9.846 0.705 9.965 67.942
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 74.905514 0.682649 109.728 <2e-16 ***
## dfTrain_ImputedMedian[, i] 0.053432 0.005465 9.777 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.25 on 2172 degrees of freedom
## Multiple R-squared: 0.04216, Adjusted R-squared: 0.04171
## F-statistic: 95.59 on 1 and 2172 DF, p-value: < 2.2e-16
## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'
##
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[,
## i])
##
## Residuals:
## Min 1Q Median 3Q Max
## -74.528 -9.251 0.948 10.415 70.006
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 74.528116 1.149967 64.809 < 2e-16 ***
## dfTrain_ImputedMedian[, i] 0.011187 0.001975 5.664 1.68e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.46 on 2172 degrees of freedom
## Multiple R-squared: 0.01455, Adjusted R-squared: 0.0141
## F-statistic: 32.08 on 1 and 2172 DF, p-value: 1.678e-08
## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'
##
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[,
## i])
##
## Residuals:
## Min 1Q Median 3Q Max
## -82.570 -9.402 0.970 10.484 63.430
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 82.5704787 0.5945630 138.876 < 2e-16 ***
## dfTrain_ImputedMedian[, i] -0.0022085 0.0006023 -3.667 0.000252 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.53 on 2172 degrees of freedom
## Multiple R-squared: 0.006152, Adjusted R-squared: 0.005695
## F-statistic: 13.45 on 1 and 2172 DF, p-value: 0.0002515
## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'
##
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[,
## i])
##
## Residuals:
## Min 1Q Median 3Q Max
## -61.638 -9.847 0.708 10.050 73.590
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 83.645750 0.476605 175.503 <2e-16 ***
## dfTrain_ImputedMedian[, i] -0.011815 0.001415 -8.352 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.33 on 2172 degrees of freedom
## Multiple R-squared: 0.03112, Adjusted R-squared: 0.03067
## F-statistic: 69.75 on 1 and 2172 DF, p-value: < 2.2e-16
## NULL
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## Warning: Use of `dfTrain_ImputedMedian$TARGET_WINS` is discouraged. Use
## `TARGET_WINS` instead.
## `geom_smooth()` using formula 'y ~ x'
##
## Call:
## lm(formula = dfTrain_ImputedMedian$TARGET_WINS ~ dfTrain_ImputedMedian[,
## i])
##
## Residuals:
## Min 1Q Median 3Q Max
## -80.809 -9.322 1.075 10.459 65.191
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 83.70498 2.23001 37.536 <2e-16 ***
## dfTrain_ImputedMedian[, i] -0.01979 0.01484 -1.334 0.182
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.57 on 2172 degrees of freedom
## Multiple R-squared: 0.0008182, Adjusted R-squared: 0.0003582
## F-statistic: 1.779 on 1 and 2172 DF, p-value: 0.1825
## NULL
Trying a transformation on team fielding error. it improves it to some degree.
dfTrain_ImputedMedian2 <- dfTrain_ImputedMedian %>%
mutate(sq = TEAM_FIELDING_E^2)
summary(lm(TARGET_WINS ~ TEAM_FIELDING_E, dfTrain_ImputedMedian2))
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_FIELDING_E, data = dfTrain_ImputedMedian2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -61.638 -9.847 0.708 10.050 73.590
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 83.645750 0.476605 175.503 <2e-16 ***
## TEAM_FIELDING_E -0.011815 0.001415 -8.352 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.33 on 2172 degrees of freedom
## Multiple R-squared: 0.03112, Adjusted R-squared: 0.03067
## F-statistic: 69.75 on 1 and 2172 DF, p-value: < 2.2e-16
summary(lm(TARGET_WINS ~ TEAM_FIELDING_E + sq, dfTrain_ImputedMedian2))
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_FIELDING_E + sq, data = dfTrain_ImputedMedian2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -63.981 -9.787 0.647 10.285 72.647
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.985e+01 7.178e-01 111.246 < 2e-16 ***
## TEAM_FIELDING_E 1.386e-02 3.924e-03 3.533 0.000419 ***
## sq -2.177e-05 3.108e-06 -7.005 3.29e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.17 on 2171 degrees of freedom
## Multiple R-squared: 0.05253, Adjusted R-squared: 0.05165
## F-statistic: 60.18 on 2 and 2171 DF, p-value: < 2.2e-16
#Two mods made - team pitching has the square temr and intreaction between hits and dp
par(mfcol=c(2,2))
mod_2 <- lm(TARGET_WINS ~ ., data = dfTrain_ImputedMedian)
summary(mod_2)
##
## Call:
## lm(formula = TARGET_WINS ~ ., data = dfTrain_ImputedMedian)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50.264 -8.466 0.163 8.273 58.924
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.9560970 5.4876280 4.365 1.33e-05 ***
## INDEX -0.0004771 0.0003788 -1.259 0.207988
## TEAM_BATTING_H 0.0482928 0.0037112 13.013 < 2e-16 ***
## TEAM_BATTING_2B -0.0232530 0.0092311 -2.519 0.011841 *
## TEAM_BATTING_3B 0.0595670 0.0169134 3.522 0.000437 ***
## TEAM_BATTING_HR 0.0655424 0.0272468 2.406 0.016234 *
## TEAM_BATTING_BB 0.0084691 0.0057882 1.463 0.143567
## TEAM_BATTING_SO -0.0100510 0.0025721 -3.908 9.61e-05 ***
## TEAM_BASERUN_SB 0.0254437 0.0044746 5.686 1.47e-08 ***
## TEAM_BASERUN_CS 0.0006521 0.0161429 0.040 0.967780
## TEAM_PITCHING_H -0.0009865 0.0003651 -2.702 0.006949 **
## TEAM_PITCHING_HR 0.0116273 0.0240289 0.484 0.628514
## TEAM_PITCHING_BB 0.0014808 0.0040999 0.361 0.718000
## TEAM_PITCHING_SO 0.0028141 0.0009069 3.103 0.001941 **
## TEAM_FIELDING_E -0.0186779 0.0024906 -7.499 9.31e-14 ***
## TEAM_FIELDING_DP -0.1091373 0.0136377 -8.003 1.97e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.86 on 2158 degrees of freedom
## Multiple R-squared: 0.3226, Adjusted R-squared: 0.3179
## F-statistic: 68.5 on 15 and 2158 DF, p-value: < 2.2e-16
plot(mod_2)
library(MASS)
## Warning: package 'MASS' was built under R version 4.0.5
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
step1 <- stepAIC(mod_2, trace=FALSE)
summary(step1)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B +
## TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
## TEAM_BASERUN_SB + TEAM_PITCHING_H + TEAM_PITCHING_SO + TEAM_FIELDING_E +
## TEAM_FIELDING_DP, data = dfTrain_ImputedMedian)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50.153 -8.411 0.176 8.307 58.465
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.6861348 5.2806294 4.296 1.82e-05 ***
## TEAM_BATTING_H 0.0486089 0.0036841 13.194 < 2e-16 ***
## TEAM_BATTING_2B -0.0233877 0.0092203 -2.537 0.011265 *
## TEAM_BATTING_3B 0.0602198 0.0166990 3.606 0.000318 ***
## TEAM_BATTING_HR 0.0770786 0.0097715 7.888 4.83e-15 ***
## TEAM_BATTING_BB 0.0104799 0.0033563 3.122 0.001817 **
## TEAM_BATTING_SO -0.0104007 0.0024834 -4.188 2.93e-05 ***
## TEAM_BASERUN_SB 0.0253857 0.0042813 5.929 3.53e-09 ***
## TEAM_PITCHING_H -0.0008928 0.0003178 -2.809 0.005008 **
## TEAM_PITCHING_SO 0.0030690 0.0006625 4.633 3.82e-06 ***
## TEAM_FIELDING_E -0.0184139 0.0024107 -7.639 3.28e-14 ***
## TEAM_FIELDING_DP -0.1095211 0.0136173 -8.043 1.43e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.86 on 2162 degrees of freedom
## Multiple R-squared: 0.3218, Adjusted R-squared: 0.3184
## F-statistic: 93.27 on 11 and 2162 DF, p-value: < 2.2e-16
Understanding the role of double plays - remove the influence of hits:
ggplot(dfTrain_ImputedMedian, aes(TEAM_FIELDING_DP, TEAM_PITCHING_H)) +
geom_point()
ggplot(dfTrain, aes(TEAM_FIELDING_DP, TEAM_PITCHING_H)) +
geom_point()
## Warning: Removed 286 rows containing missing values (geom_point).
cor(dfTrain_ImputedMedian$TEAM_FIELDING_DP, dfTrain_ImputedMedian$TEAM_PITCHING_H)
## [1] -0.08895731
summary(lm(TARGET_WINS ~ TEAM_FIELDING_DP + TEAM_PITCHING_H, dfTrain))
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_FIELDING_DP + TEAM_PITCHING_H,
## data = dfTrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -66.999 -9.102 0.739 10.013 43.146
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 75.0829610 2.4592867 30.530 < 2e-16 ***
## TEAM_FIELDING_DP -0.0045343 0.0121655 -0.373 0.709
## TEAM_PITCHING_H 0.0041845 0.0008319 5.030 5.34e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.85 on 1987 degrees of freedom
## (286 observations deleted due to missingness)
## Multiple R-squared: 0.01377, Adjusted R-squared: 0.01278
## F-statistic: 13.87 on 2 and 1987 DF, p-value: 1.038e-06
summary(lm(TARGET_WINS ~ TEAM_FIELDING_DP + TEAM_PITCHING_H, dfTrain_ImputedMedian))
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_FIELDING_DP + TEAM_PITCHING_H,
## data = dfTrain_ImputedMedian)
##
## Residuals:
## Min 1Q Median 3Q Max
## -60.237 -9.564 0.855 10.359 68.964
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 87.1139149 2.2975487 37.916 < 2e-16 ***
## TEAM_FIELDING_DP -0.0271240 0.0147930 -1.834 0.0669 .
## TEAM_PITCHING_H -0.0012921 0.0002317 -5.576 2.76e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.46 on 2171 degrees of freedom
## Multiple R-squared: 0.01493, Adjusted R-squared: 0.01402
## F-statistic: 16.45 on 2 and 2171 DF, p-value: 8.127e-08
summary(lm(TARGET_WINS ~ TEAM_FIELDING_DP*TEAM_PITCHING_H, dfTrain))
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_FIELDING_DP * TEAM_PITCHING_H,
## data = dfTrain)
##
## Residuals:
## Min 1Q Median 3Q Max
## -69.126 -9.261 1.004 9.713 47.202
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.023e+02 5.724e+00 17.872 < 2e-16 ***
## TEAM_FIELDING_DP -2.549e-01 4.914e-02 -5.188 2.35e-07 ***
## TEAM_PITCHING_H -1.244e-02 3.269e-03 -3.806 0.000145 ***
## TEAM_FIELDING_DP:TEAM_PITCHING_H 1.561e-04 2.970e-05 5.257 1.62e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.76 on 1986 degrees of freedom
## (286 observations deleted due to missingness)
## Multiple R-squared: 0.02731, Adjusted R-squared: 0.02584
## F-statistic: 18.59 on 3 and 1986 DF, p-value: 6.864e-12
summary(lm(TARGET_WINS ~ TEAM_FIELDING_DP*TEAM_PITCHING_H, dfTrain_ImputedMedian))
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_FIELDING_DP * TEAM_PITCHING_H,
## data = dfTrain_ImputedMedian)
##
## Residuals:
## Min 1Q Median 3Q Max
## -60.162 -9.515 0.820 10.312 69.257
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.833e+01 5.757e+00 13.607 <2e-16 ***
## TEAM_FIELDING_DP 3.302e-02 3.906e-02 0.845 0.3981
## TEAM_PITCHING_H 3.513e-03 2.898e-03 1.212 0.2256
## TEAM_FIELDING_DP:TEAM_PITCHING_H -3.328e-05 2.001e-05 -1.663 0.0964 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.46 on 2170 degrees of freedom
## Multiple R-squared: 0.01618, Adjusted R-squared: 0.01482
## F-statistic: 11.9 on 3 and 2170 DF, p-value: 9.984e-08
The interaction temr makes a difference.
Taking a log of Pitching_H:
ggplot(dfTrain_ImputedMedian, aes(dfTrain_ImputedMedian$TEAM_PITCHING_H)) +
geom_histogram(bins=100)
## Warning: Use of `dfTrain_ImputedMedian$TEAM_PITCHING_H` is discouraged. Use
## `TEAM_PITCHING_H` instead.
dfTrain_ImputedMedian5 <- dfTrain_ImputedMedian2 %>%
mutate(logPitch_h = TEAM_PITCHING_H^2)
ggplot(dfTrain_ImputedMedian5, aes(logPitch_h, TARGET_WINS)) +
stat_smooth(method=loess) +
geom_point()
## `geom_smooth()` using formula 'y ~ x'
m <- lm(TARGET_WINS ~ TEAM_PITCHING_H + logPitch_h, dfTrain_ImputedMedian5)
summary(m)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_PITCHING_H + logPitch_h, data = dfTrain_ImputedMedian5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -63.631 -9.694 1.045 10.242 64.174
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.944e+01 9.013e-01 88.133 < 2e-16 ***
## TEAM_PITCHING_H 1.126e-03 5.376e-04 2.094 0.0364 *
## logPitch_h -1.313e-07 2.682e-08 -4.897 1.05e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.39 on 2171 degrees of freedom
## Multiple R-squared: 0.02418, Adjusted R-squared: 0.02328
## F-statistic: 26.9 on 2 and 2171 DF, p-value: 2.895e-12
plot(m)
A closer look at Pitching_h. Taking out th outliers.
dfTrain_ImputedMedian6 <- dfTrain_ImputedMedian5 %>%
dplyr::filter(TEAM_PITCHING_H <= 1500)
dfTrain_ImputedMedian7 <- dfTrain_ImputedMedian5 %>%
dplyr::filter(TEAM_PITCHING_H > 2000)
ggplot(dfTrain_ImputedMedian6, aes(TEAM_PITCHING_H, TARGET_WINS)) +
stat_smooth(method=loess) +
geom_point()
## `geom_smooth()` using formula 'y ~ x'
m <- lm(TARGET_WINS ~ TEAM_PITCHING_H, dfTrain_ImputedMedian6)
summary(m)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_PITCHING_H, data = dfTrain_ImputedMedian6)
##
## Residuals:
## Min 1Q Median 3Q Max
## -60.864 -8.396 0.413 8.870 30.267
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.26774 8.46728 0.976 0.329
## TEAM_PITCHING_H 0.04990 0.00602 8.289 3.78e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.22 on 970 degrees of freedom
## Multiple R-squared: 0.06614, Adjusted R-squared: 0.06518
## F-statistic: 68.7 on 1 and 970 DF, p-value: 3.785e-16
plot(m)
ggplot(dfTrain_ImputedMedian7, aes(TEAM_PITCHING_H, TARGET_WINS)) +
stat_smooth(method=loess) +
geom_point()
## `geom_smooth()` using formula 'y ~ x'
m <- lm(TARGET_WINS ~ TEAM_PITCHING_H, dfTrain_ImputedMedian7)
summary(m)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_PITCHING_H, data = dfTrain_ImputedMedian7)
##
## Residuals:
## Min 1Q Median 3Q Max
## -60.879 -13.887 2.392 15.885 65.947
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 90.487384 2.180193 41.50 < 2e-16 ***
## TEAM_PITCHING_H -0.002207 0.000418 -5.28 2.77e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23.87 on 255 degrees of freedom
## Multiple R-squared: 0.09855, Adjusted R-squared: 0.09502
## F-statistic: 27.88 on 1 and 255 DF, p-value: 2.767e-07
plot(m)
ggplot(dfTrain_ImputedMedian, aes(TEAM_PITCHING_H, TARGET_WINS)) +
stat_smooth(method=loess) +
geom_point()
## `geom_smooth()` using formula 'y ~ x'
m <- lm(TARGET_WINS ~ TEAM_PITCHING_H, dfTrain_ImputedMedian)
summary(m)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_PITCHING_H, data = dfTrain_ImputedMedian)
##
## Residuals:
## Min 1Q Median 3Q Max
## -60.165 -9.462 0.897 10.651 68.914
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 83.0150688 0.5308401 156.384 < 2e-16 ***
## TEAM_PITCHING_H -0.0012543 0.0002309 -5.432 6.2e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.47 on 2172 degrees of freedom
## Multiple R-squared: 0.0134, Adjusted R-squared: 0.01295
## F-statistic: 29.5 on 1 and 2172 DF, p-value: 6.205e-08
plot(m)
Eliminting outliers has no effect - but show outliers seem to be grouped (compare new outliers with old):
dfTrain_ImputedMedian_nooutliers <- dfTrain_ImputedMedian %>%
dplyr::filter(INDEX != 1211 & INDEX != 1342 & INDEX != 1810)
m <- lm(TARGET_WINS ~ TEAM_PITCHING_H, dfTrain_ImputedMedian_nooutliers)
summary(m)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_PITCHING_H, data = dfTrain_ImputedMedian_nooutliers)
##
## Residuals:
## Min 1Q Median 3Q Max
## -60.170 -9.460 0.889 10.636 68.905
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 83.0181857 0.5306250 156.45 < 2e-16 ***
## TEAM_PITCHING_H -0.0012530 0.0002307 -5.43 6.26e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.46 on 2169 degrees of freedom
## Multiple R-squared: 0.01341, Adjusted R-squared: 0.01296
## F-statistic: 29.49 on 1 and 2169 DF, p-value: 6.263e-08
plot(m)
looking for interactions:
par(mfcol=c(2,2))
dfTrain_ImputedMedian8 <- dfTrain_ImputedMedian %>%
mutate(Pitch_h_Under1500 = ifelse(TEAM_PITCHING_H<=1500, 1, 0))
dfTrain_ImputedMedian8$Pitch_h_Under1500 <- as.factor(dfTrain_ImputedMedian8$Pitch_h_Under1500)
for(i in 2:ncol(dfTrain_ImputedMedian8)) {
print(ggplot(dfTrain_ImputedMedian8, aes(dfTrain_ImputedMedian8[ ,i], TARGET_WINS, color=Pitch_h_Under1500)) +
geom_point() +
geom_smooth(method = "lm", se=FALSE) +
ggtitle(colnames(dfTrain_ImputedMedian8)[i]))
}
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
Similar analysis with the data missing records:
dfTrain_flag <- dfTrain2 %>%
mutate(Missing_Flag = ifelse(is.na(TEAM_BATTING_SO),1,0))
mod_2 <- lm(TARGET_WINS ~ ., data = dfTrain_flag)
step1 <- stepAIC(mod_2, trace=FALSE)
summary(step1)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_HBP +
## TEAM_PITCHING_HR + TEAM_PITCHING_BB + TEAM_PITCHING_SO +
## TEAM_FIELDING_E + TEAM_FIELDING_DP, data = dfTrain_flag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.2248 -5.6294 -0.0212 5.0439 21.3065
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 60.95454 19.10292 3.191 0.001670 **
## TEAM_BATTING_H 0.02541 0.01009 2.518 0.012648 *
## TEAM_BATTING_HBP 0.08712 0.04852 1.796 0.074211 .
## TEAM_PITCHING_HR 0.08945 0.02394 3.736 0.000249 ***
## TEAM_PITCHING_BB 0.05672 0.00940 6.034 8.66e-09 ***
## TEAM_PITCHING_SO -0.03136 0.00728 -4.308 2.68e-05 ***
## TEAM_FIELDING_E -0.17218 0.03970 -4.338 2.38e-05 ***
## TEAM_FIELDING_DP -0.11904 0.03516 -3.386 0.000869 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.422 on 183 degrees of freedom
## (2080 observations deleted due to missingness)
## Multiple R-squared: 0.5345, Adjusted R-squared: 0.5167
## F-statistic: 30.02 on 7 and 183 DF, p-value: < 2.2e-16
Only interaction appears with the fielding_errors. Hwoever, If we interact with itself it greatly improves the r squared.
dfTrain_ImputedMedian9 <- dfTrain_ImputedMedian8 %>%
mutate(Pitch_h_squared = TEAM_PITCHING_H^2) %>%
mutate(Pitch_h_log = log(TEAM_PITCHING_H)) %>%
mutate(Pitch_h_sqrt = sqrt(TEAM_PITCHING_H))
summary(lm(TARGET_WINS ~ Pitch_h_squared, dfTrain_ImputedMedian9))
##
## Call:
## lm(formula = TARGET_WINS ~ Pitch_h_squared, data = dfTrain_ImputedMedian9)
##
## Residuals:
## Min 1Q Median 3Q Max
## -60.015 -9.069 0.997 10.158 66.609
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.119e+01 3.359e-01 241.736 < 2e-16 ***
## Pitch_h_squared -8.054e-08 1.147e-08 -7.024 2.88e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.4 on 2172 degrees of freedom
## Multiple R-squared: 0.02221, Adjusted R-squared: 0.02176
## F-statistic: 49.33 on 1 and 2172 DF, p-value: 2.883e-12
summary(lm(TARGET_WINS ~ Pitch_h_log, dfTrain_ImputedMedian9))
##
## Call:
## lm(formula = TARGET_WINS ~ Pitch_h_log, data = dfTrain_ImputedMedian9)
##
## Residuals:
## Min 1Q Median 3Q Max
## -78.408 -9.582 1.145 10.356 66.161
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 87.2807 7.9389 10.994 <2e-16 ***
## Pitch_h_log -0.8795 1.0706 -0.822 0.411
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.57 on 2172 degrees of freedom
## Multiple R-squared: 0.0003106, Adjusted R-squared: -0.0001496
## F-statistic: 0.6749 on 1 and 2172 DF, p-value: 0.4114
summary(lm(TARGET_WINS ~ Pitch_h_sqrt, dfTrain_ImputedMedian9))
##
## Call:
## lm(formula = TARGET_WINS ~ Pitch_h_sqrt, data = dfTrain_ImputedMedian9)
##
## Residuals:
## Min 1Q Median 3Q Max
## -67.753 -9.477 0.982 10.732 68.378
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 85.48013 1.47144 58.09 < 2e-16 ***
## Pitch_h_sqrt -0.11429 0.03474 -3.29 0.00102 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.54 on 2172 degrees of freedom
## Multiple R-squared: 0.00496, Adjusted R-squared: 0.004501
## F-statistic: 10.83 on 1 and 2172 DF, p-value: 0.001017
m <- lm(TARGET_WINS ~ TEAM_PITCHING_H*Pitch_h_Under1500, dfTrain_ImputedMedian8)
summary(m)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_PITCHING_H * Pitch_h_Under1500,
## data = dfTrain_ImputedMedian8)
##
## Residuals:
## Min 1Q Median 3Q Max
## -60.864 -9.153 0.979 9.772 67.940
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.643e+01 6.550e-01 131.965 < 2e-16 ***
## TEAM_PITCHING_H -1.771e-03 2.322e-04 -7.628 3.55e-14 ***
## Pitch_h_Under15001 -7.816e+01 1.047e+01 -7.466 1.19e-13 ***
## TEAM_PITCHING_H:Pitch_h_Under15001 5.167e-02 7.432e-03 6.952 4.76e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.08 on 2170 degrees of freedom
## Multiple R-squared: 0.06361, Adjusted R-squared: 0.06232
## F-statistic: 49.14 on 3 and 2170 DF, p-value: < 2.2e-16
plot(m)
summary(lm(TARGET_WINS ~ TEAM_FIELDING_E*Pitch_h_Under1500, dfTrain_ImputedMedian9))
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_FIELDING_E * Pitch_h_Under1500,
## data = dfTrain_ImputedMedian9)
##
## Residuals:
## Min 1Q Median 3Q Max
## -62.182 -9.571 0.598 9.826 73.499
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 87.867745 0.643380 136.572 < 2e-16 ***
## TEAM_FIELDING_E -0.016158 0.001498 -10.787 < 2e-16 ***
## Pitch_h_Under15001 -0.776515 1.469068 -0.529 0.597
## TEAM_FIELDING_E:Pitch_h_Under15001 -0.042078 0.008364 -5.031 5.28e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.88 on 2170 degrees of freedom
## Multiple R-squared: 0.08892, Adjusted R-squared: 0.08766
## F-statistic: 70.59 on 3 and 2170 DF, p-value: < 2.2e-16
summary(lm(TARGET_WINS ~ TEAM_FIELDING_E, dfTrain_ImputedMedian9))
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_FIELDING_E, data = dfTrain_ImputedMedian9)
##
## Residuals:
## Min 1Q Median 3Q Max
## -61.638 -9.847 0.708 10.050 73.590
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 83.645750 0.476605 175.503 <2e-16 ***
## TEAM_FIELDING_E -0.011815 0.001415 -8.352 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.33 on 2172 degrees of freedom
## Multiple R-squared: 0.03112, Adjusted R-squared: 0.03067
## F-statistic: 69.75 on 1 and 2172 DF, p-value: < 2.2e-16
Final Mods:
dfTrain_ImputedMedian8$Pitch_h_Under1500 <- as.numeric(dfTrain_ImputedMedian8$Pitch_h_Under1500)
dfTrain_ImputedMedian10 <- dfTrain_ImputedMedian8 %>%
mutate(Prod_DP_H = TEAM_FIELDING_DP*TEAM_PITCHING_H) %>%
mutate(inter_H_Itself = TEAM_PITCHING_H*Pitch_h_Under1500) %>%
mutate(Inter_H_Err = TEAM_FIELDING_E*Pitch_h_Under1500) %>%
mutate(TEAM_PITCHING_H = log(TEAM_PITCHING_H)) %>%
mutate(E_sq = TEAM_FIELDING_E^2)
mod_2 <- lm(TARGET_WINS ~ ., data = dfTrain_ImputedMedian)
summary(mod_2)
##
## Call:
## lm(formula = TARGET_WINS ~ ., data = dfTrain_ImputedMedian)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50.264 -8.466 0.163 8.273 58.924
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.9560970 5.4876280 4.365 1.33e-05 ***
## INDEX -0.0004771 0.0003788 -1.259 0.207988
## TEAM_BATTING_H 0.0482928 0.0037112 13.013 < 2e-16 ***
## TEAM_BATTING_2B -0.0232530 0.0092311 -2.519 0.011841 *
## TEAM_BATTING_3B 0.0595670 0.0169134 3.522 0.000437 ***
## TEAM_BATTING_HR 0.0655424 0.0272468 2.406 0.016234 *
## TEAM_BATTING_BB 0.0084691 0.0057882 1.463 0.143567
## TEAM_BATTING_SO -0.0100510 0.0025721 -3.908 9.61e-05 ***
## TEAM_BASERUN_SB 0.0254437 0.0044746 5.686 1.47e-08 ***
## TEAM_BASERUN_CS 0.0006521 0.0161429 0.040 0.967780
## TEAM_PITCHING_H -0.0009865 0.0003651 -2.702 0.006949 **
## TEAM_PITCHING_HR 0.0116273 0.0240289 0.484 0.628514
## TEAM_PITCHING_BB 0.0014808 0.0040999 0.361 0.718000
## TEAM_PITCHING_SO 0.0028141 0.0009069 3.103 0.001941 **
## TEAM_FIELDING_E -0.0186779 0.0024906 -7.499 9.31e-14 ***
## TEAM_FIELDING_DP -0.1091373 0.0136377 -8.003 1.97e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.86 on 2158 degrees of freedom
## Multiple R-squared: 0.3226, Adjusted R-squared: 0.3179
## F-statistic: 68.5 on 15 and 2158 DF, p-value: < 2.2e-16
par(mfcol=c(2,2))
mod_2 <- lm(TARGET_WINS ~ ., data = dfTrain_ImputedMedian10)
summary(mod_2)
##
## Call:
## lm(formula = TARGET_WINS ~ ., data = dfTrain_ImputedMedian10)
##
## Residuals:
## Min 1Q Median 3Q Max
## -48.102 -8.210 0.314 8.312 61.874
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.650e+02 3.613e+01 -4.567 5.22e-06 ***
## INDEX -4.911e-04 3.746e-04 -1.311 0.189965
## TEAM_BATTING_H 3.562e-02 4.340e-03 8.208 3.82e-16 ***
## TEAM_BATTING_2B -2.155e-02 9.260e-03 -2.327 0.020033 *
## TEAM_BATTING_3B 8.171e-02 1.760e-02 4.644 3.63e-06 ***
## TEAM_BATTING_HR 1.203e-01 3.066e-02 3.925 8.95e-05 ***
## TEAM_BATTING_BB 2.204e-02 6.083e-03 3.622 0.000299 ***
## TEAM_BATTING_SO -1.028e-02 2.686e-03 -3.827 0.000133 ***
## TEAM_BASERUN_SB 2.739e-02 4.636e-03 5.908 4.02e-09 ***
## TEAM_BASERUN_CS 5.357e-03 1.621e-02 0.330 0.741104
## TEAM_PITCHING_H 2.612e+01 5.215e+00 5.008 5.95e-07 ***
## TEAM_PITCHING_HR -4.297e-02 2.757e-02 -1.559 0.119245
## TEAM_PITCHING_BB -5.488e-03 4.231e-03 -1.297 0.194696
## TEAM_PITCHING_SO 2.273e-03 9.513e-04 2.389 0.016979 *
## TEAM_FIELDING_E 7.044e-03 1.113e-02 0.633 0.526826
## TEAM_FIELDING_DP -8.680e-02 3.424e-02 -2.535 0.011307 *
## Pitch_h_Under1500 1.014e+01 4.512e+00 2.246 0.024785 *
## Prod_DP_H -6.911e-06 1.826e-05 -0.379 0.705076
## inter_H_Itself -2.000e-03 2.911e-03 -0.687 0.492098
## Inter_H_Err -2.725e-02 8.008e-03 -3.403 0.000678 ***
## E_sq -7.588e-06 4.520e-06 -1.679 0.093370 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.7 on 2153 degrees of freedom
## Multiple R-squared: 0.3408, Adjusted R-squared: 0.3347
## F-statistic: 55.66 on 20 and 2153 DF, p-value: < 2.2e-16
plot(mod_2)
step1 <- stepAIC(mod_2, trace=FALSE)
summary(step1)
##
## Call:
## lm(formula = TARGET_WINS ~ TEAM_BATTING_H + TEAM_BATTING_2B +
## TEAM_BATTING_3B + TEAM_BATTING_HR + TEAM_BATTING_BB + TEAM_BATTING_SO +
## TEAM_BASERUN_SB + TEAM_PITCHING_H + TEAM_PITCHING_HR + TEAM_PITCHING_SO +
## TEAM_FIELDING_DP + Pitch_h_Under1500 + inter_H_Itself + Inter_H_Err +
## E_sq, data = dfTrain_ImputedMedian10)
##
## Residuals:
## Min 1Q Median 3Q Max
## -48.562 -8.169 0.338 8.309 61.515
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.647e+02 3.089e+01 -5.331 1.08e-07 ***
## TEAM_BATTING_H 3.681e-02 4.250e-03 8.660 < 2e-16 ***
## TEAM_BATTING_2B -2.174e-02 9.232e-03 -2.355 0.018605 *
## TEAM_BATTING_3B 8.082e-02 1.730e-02 4.671 3.18e-06 ***
## TEAM_BATTING_HR 1.350e-01 2.803e-02 4.818 1.55e-06 ***
## TEAM_BATTING_BB 1.541e-02 3.369e-03 4.575 5.03e-06 ***
## TEAM_BATTING_SO -9.744e-03 2.630e-03 -3.705 0.000217 ***
## TEAM_BASERUN_SB 2.693e-02 4.281e-03 6.291 3.81e-10 ***
## TEAM_PITCHING_H 2.610e+01 4.390e+00 5.946 3.20e-09 ***
## TEAM_PITCHING_HR -5.769e-02 2.526e-02 -2.284 0.022461 *
## TEAM_PITCHING_SO 1.612e-03 7.791e-04 2.068 0.038723 *
## TEAM_FIELDING_DP -9.966e-02 1.364e-02 -7.304 3.91e-13 ***
## Pitch_h_Under1500 1.138e+01 1.510e+00 7.532 7.28e-14 ***
## inter_H_Itself -3.339e-03 5.513e-04 -6.057 1.63e-09 ***
## Inter_H_Err -2.277e-02 4.433e-03 -5.136 3.05e-07 ***
## E_sq -5.730e-06 3.596e-06 -1.593 0.111234
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.7 on 2158 degrees of freedom
## Multiple R-squared: 0.3396, Adjusted R-squared: 0.335
## F-statistic: 73.97 on 15 and 2158 DF, p-value: < 2.2e-16
Checking interactions with the missing vaolues cohort:
looking for interactions:
par(mfcol=c(2,2))
dfTrain_ImputedMean$Missing_Flag <- as.factor(dfTrain_ImputedMean$Missing_Flag)
for(i in 2:ncol(dfTrain_ImputedMean)) {
print(ggplot(dfTrain_ImputedMean, aes(dfTrain_ImputedMean[ ,i], TARGET_WINS, color=Missing_Flag)) +
geom_point() +
geom_smooth(method = "lm", se=FALSE) +
ggtitle(colnames(dfTrain_ImputedMean)[i]))
}
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'